library(drake)
library(tidyverse)
# Load the `data`, `data_trans` and `data_mice` targets into the session
# with drake's loadd(); everything below reads these three objects.
loadd(data, data_trans, data_mice)
# Look at NA values in the raw data: non-NA counts per column, then counts of rows that are zero or NA
# Per-column count of non-missing entries in the raw data.
map_int(data, function(column) sum(!is.na(column)))
## iso3c n_amr_events n_amr_first_events health_expend_perc
## 190 59 59 183
## migrant_pop_per_capita population ab_export_bin english_spoken
## 190 190 162 190
## human_consumption_ddd livestock_consumption_kg_per_pcu livestock_pcu ab_export_per_capita
## 68 164 164 87
## ab_import_per_capita livestock_consumption_kg_per_capita gdp_per_capita tourism_outbound_per_capita
## 161 31 190 44
## tourism_inbound_per_capita pubcrawl_per_capita promed_mentions_per_capita
## 111 180 189
# Number of rows whose pubcrawl_per_capita is either missing or exactly zero.
nrow(filter(data, is.na(pubcrawl_per_capita) | pubcrawl_per_capita == 0))
## [1] 33
# Number of rows whose promed_mentions_per_capita is either missing or
# exactly zero.
nrow(
  filter(
    data,
    is.na(promed_mentions_per_capita) | promed_mentions_per_capita == 0
  )
)
## [1] 1
# Number of rows whose ab_export_per_capita is either missing or exactly zero.
nrow(filter(data, is.na(ab_export_per_capita) | ab_export_per_capita == 0))
## [1] 103
# Look at NA values and distributions after NA processing
# Per-column count of non-missing entries in the transformed data.
map_int(data_trans, function(column) sum(!is.na(column)))
## iso3c n_amr_events health_expend_perc ln_migrant_pop_per_capita
## 190 190 183 190
## ln_population ab_export_bin english_spoken human_consumption_ddd
## 190 190 190 68
## ln_livestock_pcu ln_ab_export_per_capita ln_ab_import_per_capita ln_livestock_consumption_kg_per_capita
## 164 190 161 31
## ln_gdp_per_capita ln_tourism_outbound_per_capita ln_tourism_inbound_per_capita ln_pubcrawl_per_capita
## 190 44 111 190
## ln_promed_mentions_per_capita
## 190
# Confirm that no column of the transformed data contains infinite values
# (every entry of the result should be FALSE).
map_lgl(data_trans, function(column) any(is.infinite(column)))
## iso3c n_amr_events health_expend_perc ln_migrant_pop_per_capita
## FALSE FALSE FALSE FALSE
## ln_population ab_export_bin english_spoken human_consumption_ddd
## FALSE FALSE FALSE FALSE
## ln_livestock_pcu ln_ab_export_per_capita ln_ab_import_per_capita ln_livestock_consumption_kg_per_capita
## FALSE FALSE FALSE FALSE
## ln_gdp_per_capita ln_tourism_outbound_per_capita ln_tourism_inbound_per_capita ln_pubcrawl_per_capita
## FALSE FALSE FALSE FALSE
## ln_promed_mentions_per_capita
## FALSE
# Histogram of every column of the transformed data except iso3c,
# n_amr_events and english_spoken: one facet per variable, each with its own
# free axes.
data_trans %>%
  select(-iso3c, -n_amr_events, -english_spoken) %>%
  # pivot_longer() replaces the superseded gather(); it stacks all remaining
  # columns into name/value pairs so they can be faceted.
  pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = value)) +
  # bins = 30 is the value geom_histogram() falls back to by default; stating
  # it keeps the plot unchanged while avoiding the "pick better value" message.
  geom_histogram(bins = 30) +
  facet_wrap(~variable, scales = "free")
# Spearman correlation chart (with histograms on the diagonal) for the
# transformed data, after dropping the columns listed in the select() call.
PerformanceAnalytics::chart.Correlation(
  dplyr::select(
    data_trans,
    -iso3c,
    -ln_livestock_pcu,
    -ln_ab_import_per_capita,
    -ab_export_bin,
    -english_spoken
  ),
  histogram = TRUE,
  pch = 19,
  method = "spearman"
)
# Look at the imputed data
# Plot the imputation object to check convergence. On convergence, the
# different streams should be freely intermingled with one another, without
# showing any definite trends. Convergence is diagnosed when the variance
# between different sequences is no larger than the variance within each
# individual sequence.
plot(data_mice)
# show_imputes() is defined outside this section; it is given the imputation
# object, that object's `m` element, and the transformed data as `raw`.
show_imputes(data_mice, m = data_mice[["m"]], raw = data_trans)
# NOTE(review): presumably this resolves to the imputation package's
# complete() rather than tidyr's complete() (attached through tidyverse) --
# confirm the dispatch or namespace-qualify the call.
imp <- complete(data_mice)
# Spearman correlation chart (with histograms on the diagonal) for the
# completed data in `imp`, leaving out the iso3c column.
PerformanceAnalytics::chart.Correlation(
  dplyr::select(imp, -iso3c),
  histogram = TRUE,
  pch = 19,
  method = "spearman"
)